library(feather) #feather format for fastest i/o
library(data.table) 
library(lubridate)


#######################################
#####Step 1: I/O and initial formatting
##########################################
#voters <- fread("Input Files/AllNYSVoters.csv", header=FALSE)
voters <- read.delim("Input Files/idahofile/20180221_Statewide_w_History/VoterData.txt", sep="|", header=F)
dim(voters) #under 1m. not too worried about counties, actually. 
library(rvest)
file <- read_html("Input Files/idahofile/20180221_Statewide_w_History/Report.html") %>%
  html_table()
colnames(voters) <- file[[4]][-c(1:2),1]

#no need to subset by county here the way we did for NYC.
#set up voting. 
voters$voted16 <- voters$'VOTED 1' == "VOTED"
voters$voted12 <- voters$'VOTED 6' == "VOTED"
voters$voted14 <- voters$'VOTED 4' == "VOTED"
voters$voted10 <- voters$'VOTED 8' == "VOTED"
#much quicker than in NY file.

##Import Employees
thirteen <- read.csv("Input Files/idemployees/2013/state.csv",
                     stringsAsFactors=F)
fourteen <- read.csv("Input Files/idemployees//2014/state_2014.csv",
                     stringsAsFactors = F)
summary(thirteen$salary)
thirteen$year <- 2013
fourteen$year <- 2014
employees <- data.table(rbind(thirteen, fourteen)); dim(employees)

#Id for employees -- only unique for fiscal year/ employee pairs
employees$idEmployees <- 1:nrow(employees)

#Make a full name variable
employees$EmployeeFull <- paste(employees$first_name, employees$middle_initial, employees$last_name)
employees$EmployeeFirstLast <- paste(employees$first_name, employees$last_name)

setkey(employees, "name", "agency", "job_title") 

#what if we then reshape wide before the merge, so we have a sense of how many unique employees we're working with?
#let's do that next.
payroll <- employees[!(name==""),]; dim(employees); dim(payroll) #drop the few with no name

payroll[, employeekey := paste(name, agency, sep=" ")]
setkey(payroll, employeekey, pay_basis, ft_pt)

#before we reshape wide, see how many duplicates we're going to run into. 
payroll[, obscount := nrow(.SD), by=list(employeekey, pay_basis, year)]
table(payroll$obscount) 
duplicated <- payroll[obscount >1, ]; dim(duplicated)
#focus on hourly employees from here on out
#and then of remaining duplicates, we'll focus on the lowest hourly wage (so we can see if they'd have been treated at all.)
hourly <- payroll[pay_basis=="HOURLY", ]; dim(hourly) #limit to hourly
setkey(hourly, employeekey, year, salary)

hourly <- hourly[!salary < 7.25, ]; dim(hourly) #drop anybody below federal MW. (just a handful in here) 

hourly[, obscount := nrow(.SD), by=list(employeekey, year)] #now look for dupes again
table(hourly$obscount) #still plenty of duplicates; look at them
duplicated <- hourly[obscount >1, ]; dim(duplicated) #still some. keep lowest-paying job?

setkey(hourly, "employeekey", "year", "salary") #sort so lowest pay is first
hourly[, obsnumber := 1:nrow(.SD), by=list(employeekey, year)] #mark dupes
hourly <- hourly[obsnumber==1,] #keep first observation only
hourly[, obscount := nrow(.SD), by=list(employeekey, year)] #now look for dupes once more
table(hourly$obscount) #none. 

#okay, now reshape. 
employeeswide <- reshape(hourly, idvar = c("employeekey"), timevar = "year", v.names = c("job_title", "salary", "pay_basis","ft_pt", "county", "appt_type", "idEmployees"), direction = "wide")
dim(employeeswide) 

employeeswide[!is.na(county.2013), county := county.2013]
employeeswide[!is.na(county.2014), county := county.2014]

##############
#Step 3:The Merge
#########
voters <- data.table(voters)
setnames(voters, "FIRST NAME", "first_name")
setnames(voters, "LAST NAME", "last_name")
setnames(voters, "MIDDLE NAME", "middle_name")
voters[, middleinitial:= toupper(substr(middle_name, 0,1))] #line up format of middle initials before merging on them
voters[, county:= as.character(voters$'COUNTY NAME')]
##set keys
setkey(voters, last_name, first_name, middleinitial, county)
setkey(employeeswide, last_name, first_name)

df <- merge(employeeswide, voters, by.x= c("last_name", "first_name"), 
            by.y = c("last_name", "first_name"))
dim(df); dim(df[is.na(df$'VOTER ID')==T,]); dim(employeeswide)
voterfilematches <- nrow(df)

df[middleinitial == "", middleinitial := NA] #first, correct the ones that don't actually have middle initials
df[, middlenamemismatch:= 0]
df[!(is.na(middle_initial)) & !(is.na(middleinitial)) & middleinitial!= middle_initial , middlenamemismatch := 1]
df <- df[!(middlenamemismatch==1), ]
dim(df)

#note that in ID, we don't have employees' start dates, or voters who've been purged due to death, so can't pare down matches based on that.
#but can use age (don't have DOB) to discard matches to implausibly old people?

#might also try tie-breaking duplicate matches with county

######################################
###Step 7: Break Ties With Counties
###################################

df$countyMatch <- df$county.x == df$county.y

#what do we mean by tiebreaking?
#How about this: when there are duplicate obs for an employee, and that employee has a work borough location, then we should privilege matches where county matches borough (and drop other matches). but if any of those aren't true, we'll just keep all the matches? 
df[, obscount := nrow(.SD), by=list(employeekey)]
df[, employeehascountymatch := max(countyMatch), by=list(employeekey)] #mark if there is a row with countymatch
df[, employeehascounty := !is.na(.SD$county.x), by=list(employeekey)]
df[, countytiebreak := 0]
df[(employeehascounty & employeehascountymatch & obscount > 1)  & countyMatch==F, countytiebreak := 1] #mark rows to drop
table(df$countytiebreak) #drops about 4500 rows.

#df <- df[countytiebreak==0, ] #stop doing this? if so, just comment this line out. 


###################################
###Step 8: Break Ties With Age
###############################
#It seems  unlikely the elderly are working for the city. If there's a duplicate match, we can prefer the one
#that's younger. Maybe want to expand this to just delete all matches that over a certain age? come back and decide


df$Age <- df$AGE - 4 #there's no DOB in here, and the "AGE" is as of the date of the file (2018)
df$Over70 <- df$Age >70 #& df$LeaveStatusasofJune30 == "ACTIVE"
df[, obscount := nrow(.SD), by=list(employeekey)]
df[, matchesover70 := sum(.SD$Over70), by=list(employeekey)] #can see if it's less than total rows (so we know if there's a younger row for us to keep, from among the duplicates) 
df[, agetiebreak:= 0]
df[(obscount > 1 & (matchesover70 < obscount) & Over70==T), agetiebreak:= 1] #mark the ones to drop 
table(df$agetiebreak) #drops about 300 people, or 750 if not breaking ties by county
df <- df[agetiebreak==0, ]; dim(df)

####now clean up voting, collapse any further voting duplicates, stick it back to our baseline dataset (employeeswide) to ensure we haven't lost anyone, and save.

#want to collapse further duplicates by averaging across voting records
df.hourly <- df
df.hourly[, observations:= .N, by=list(employeekey)]
table(df.hourly$observations)#mostly unique, but some high-duplicates

setkey(df.hourly, employeekey)

votes <- df.hourly[,list(Voted16=rbinom(1,1, p=mean(voted16)), Voted12=rbinom(1,1, p=mean(voted12)), Voted14=rbinom(1,1, p=mean(voted14)), Voted10=rbinom(1,1, p=mean(voted10)), numrecords = nrow(.SD)), by=list(employeekey)]
votes[, mergedtovoterfile:=1]
collapsedmatches <- nrow(votes)

hourlyfinal <- merge(employeeswide, votes, by=c("employeekey"), all.x=T)
dim(hourlyfinal); dim(employeeswide); dim(votes)

#recode missing is not voted
hourlyfinal$Voted16[is.na(hourlyfinal$Voted16)] <- 0
hourlyfinal$Voted12[is.na(hourlyfinal$Voted12)] <- 0
hourlyfinal$Voted14[is.na(hourlyfinal$Voted14)] <- 0
hourlyfinal$Voted10[is.na(hourlyfinal$Voted10)] <- 0

summary(hourlyfinal$Voted16); summary(hourlyfinal$Voted12)
summary(hourlyfinal$Voted14); summary(hourlyfinal$Voted10)
percentagematchedtoVF <- round(100*(sum(hourlyfinal$mergedtovoterfile, na.rm=T) / nrow(hourlyfinal)))
percentagematchedtoVF#68%-- reasonable?

write_feather(hourlyfinal, "Intermediate Files/IDmergedfile.feather") 
